# !git lfs install
# !git clone https://huggingface.co/datasets/parambharat/tamil_asr_corpus
# !add-apt-repository -y ppa:jonathonf/ffmpeg-4
# !apt update
# !apt install -y ffmpeg
# !pip uninstall -y transformers datasets
# !pip install audiomentations
# !pip install git+https://github.com/huggingface/datasets
# !pip install git+https://github.com/huggingface/transformers
# !pip install librosa soundfile
# !pip install "evaluate>=0.3.0"
# !pip install jiwer
# !pip install more-itertools
# !pip install wandb
# !pip install bitsandbytes
# !pip install "holoviews[recommended]"
%set_env WANDB_LOG_MODEL=True
%set_env WANDB_WATCH=all
%set_env WANDB_NOTEBOOK_NAME=whisper_small_ta.ipynb
env: WANDB_LOG_MODEL=True env: WANDB_WATCH=all env: WANDB_NOTEBOOK_NAME=whisper_small_ta.ipynb
# Standard library
import numbers
import string
import tempfile
from dataclasses import dataclass
from io import StringIO
from pathlib import Path
from typing import Any, Dict, List, Union

# Third-party
import evaluate
import holoviews as hv
import jiwer
import numpy as np
import pandas as pd
import panel as pn
import torch
import wandb
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
from bokeh.resources import INLINE
from datasets import Dataset, IterableDatasetDict, load_dataset, interleave_datasets, Audio
from huggingface_hub import notebook_login
from IPython.display import clear_output
from torch.utils.data import IterableDataset
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TrainerCallback,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
)
from transformers.integrations import WandbCallback
from transformers.trainer_pt_utils import IterableDatasetShard

hv.extension("bokeh", logo=False)
torch.cuda.is_available()
True
# wandb.login()
# notebook_login()
run = wandb.init(project="whisper_finetuning", job_type="fine-tuning", group="small-ta", resume="must", id="17xqqp5b")
wandb: Currently logged in as: parambharat. Use `wandb login --relogin` to force relogin
VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016671159649983263, max=1.0…
/home/ubuntu/whisper-finetuning/notebooks/wandb/run-20221212_073428-17xqqp5b
def load_data_splits(is_streaming=True, stopping_strategy="all_exhausted"):
    """Load the Tamil ASR corpus splits from the local dataset directory.

    Args:
        is_streaming: if True, return streaming (iterable) datasets so the
            corpus is never fully materialized in memory.
        stopping_strategy: unused here; kept for backward compatibility with
            callers written against an interleave_datasets-based loader.

    Returns:
        The dict of dataset splits returned by ``load_dataset``.
    """
    # Removed the dead local `dataset_dict = {}` — it was never used.
    return load_dataset("../data/tamil_asr_corpus/", streaming=is_streaming)
# Load the (streaming) Tamil ASR corpus splits once at module level.
dataset_dict = load_data_splits()
# Waveform-level augmentation pipeline: each transform fires independently
# with probability 0.3 (noise, time-stretch, pitch-shift).
augment_waveform = Compose([
    AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=0.3),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.3, leave_length_unchanged=False),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.3)
,])
def augment_dataset(batch):
    """Run the module-level ``augment_waveform`` pipeline on one example.

    Mutates ``batch["audio"]["array"]`` in place and returns the batch, as
    expected by ``datasets.map``.
    """
    samples = batch["audio"]["array"]
    batch["audio"]["array"] = augment_waveform(samples=samples, sample_rate=16000)
    return batch
# call augment dataset on the training set
# (evaluation splits stay un-augmented; with streaming data the map is lazy)
dataset_dict["train"] = dataset_dict["train"].map(augment_dataset)
# Converts raw waveforms into log-Mel input features for Whisper.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small"
)
# Tokenizer configured for Tamil transcription; labels capped at 225 tokens.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    language="Tamil",
    task="transcribe",
    model_max_length=225
)
# Processor bundles the feature extractor + tokenizer (used by the collator).
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="Tamil",
    task="transcribe",
    model_max_length=225
)
def fix_sentence(sentence):
    """Normalize a transcript for training.

    Strips a wrapping pair of double quotes, appends a full stop when the
    sentence lacks terminal punctuation, and removes all other ASCII
    punctuation (keeping only the final terminator).

    Args:
        sentence: raw transcript string.

    Returns:
        The cleaned transcript. An input that is empty after quote stripping
        (e.g. ``'""'``) is returned as "" instead of raising IndexError.
    """
    transcription = sentence
    if transcription.startswith('"') and transcription.endswith('"'):
        # we can remove trailing quotation marks as they do not affect the transcription
        transcription = transcription[1:-1]
    if not transcription:
        # BUGFIX: a quote-only transcript (length 2, so it survives the
        # min-length filter) becomes empty here; indexing [-1] below would
        # raise IndexError.
        return transcription
    if transcription[-1] not in [".", "?", "!"]:
        # append a full-stop to sentences that do not end in punctuation
        transcription = transcription + "."
    # strip punctuation from everything except the final sentence terminator
    transcription = transcription[:-1].translate(str.maketrans('', '', string.punctuation)) + transcription[-1]
    return transcription
def prepare_dataset(examples):
    """Turn one example into model-ready inputs.

    Computes log-Mel ``input_features`` from the audio array and tokenized
    ``labels`` from the cleaned transcript, both written back onto the example.
    """
    audio = examples["audio"]
    # compute log-Mel input features from input audio array
    features = feature_extractor(audio["array"], sampling_rate=16000)
    examples["input_features"] = features.input_features[0]
    # encode target text to label ids
    cleaned = fix_sentence(examples["sentence"])
    examples["labels"] = tokenizer(cleaned, max_length=225, truncation=True).input_ids
    return examples
def filter_empty_strings(sentence):
    """Return True when the transcript has at least 2 characters.

    Used with ``datasets.filter`` to drop empty / single-character transcripts.
    """
    # Direct boolean return instead of the redundant if/else True/False form.
    return len(sentence) >= 2
# Drop examples whose transcript is shorter than 2 characters.
for k in dataset_dict:
    dataset_dict[k] = dataset_dict[k].filter(filter_empty_strings, input_columns=["sentence"])
# Convert every split into model-ready tensors (input features + label ids).
for k in dataset_dict:
    dataset_dict[k] = dataset_dict[k].map(
        prepare_dataset,).with_format("torch")
# Shuffle the streaming training split using a 500-example buffer.
dataset_dict["train"] = dataset_dict["train"].shuffle(buffer_size=500)
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into padded batches for Whisper seq2seq training.

    Audio features and label ids require different padding strategies, so
    they are padded separately; padded label positions are replaced with
    -100 so the loss function ignores them.
    """

    # Whisper processor bundling the feature extractor and tokenizer.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        # get the tokenized label sequences
        # NOTE(review): truncate_sequences is called with default arguments
        # (num_tokens_to_remove defaults to 0), so this appears to be a
        # no-op wrapper around the label ids — confirm intent.
        label_features = [{"input_ids": self.processor.tokenizer.truncate_sequences(feature["labels"])[0]}
                          for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt",)
        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]
        batch["labels"] = labels
        return batch
# Batch collator instance used by the trainer.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
# Word-error-rate metric from the `evaluate` library.
metric = evaluate.load("wer")
# evaluate with the 'normalised' WER
do_normalize_eval = True
def compute_metrics(pred):
    """Compute the (optionally normalized) word error rate for a prediction set.

    Returns a dict with a single "wer" key, expressed as a percentage.
    """
    pred_ids, label_ids = pred.predictions, pred.label_ids
    # -100 is the ignore-index used for padded label positions; restore the
    # real pad token so decoding works.
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    # Decode without grouping tokens so WER is computed on raw text.
    decode = processor.tokenizer.batch_decode
    pred_str = decode(pred_ids, skip_special_tokens=True, normalize=do_normalize_eval)
    label_str = decode(label_ids, skip_special_tokens=True, normalize=do_normalize_eval)
    return {"wer": 100 * metric.compute(predictions=pred_str, references=label_str)}
# Resume from the W&B model artifact checkpoint of this run (version v0).
model = WhisperForConditionalGeneration.from_pretrained("./artifacts/model-17xqqp5b:v0", use_cache=False)
# Clear forced decoder ids / suppressed tokens so generation is unconstrained.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
# Caching is incompatible with gradient checkpointing during training.
model.config.use_cache = False
# trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
class ShuffleCallback(TrainerCallback):
def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
if isinstance(train_dataloader.dataset, IterableDatasetShard):
pass # set_epoch() is handled by the Trainer
elif isinstance(train_dataloader.dataset, IterableDataset):
train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)
def load_samples_dataset(dataset, num_samples=100):
    """Materialize the first ``num_samples`` examples of an iterable dataset.

    Args:
        dataset: any iterable of example dicts (e.g. a streaming split).
        num_samples: number of examples to take; 0 yields an empty dataset.

    Returns:
        An in-memory ``Dataset`` built from the collected examples.
    """
    samples = []
    for i, item in enumerate(dataset):
        # BUGFIX: check the bound before appending so num_samples <= 0 no
        # longer iterates (and collects) the entire streaming dataset — the
        # old `i == num_samples - 1` break could never fire in that case.
        if i >= num_samples:
            break
        samples.append(item)
    return Dataset.from_list(samples)
def compute_spectrograms(example):
    """Return the unpadded log-Mel spectrogram for one example's waveform."""
    audio_samples = example["audio"]["array"]
    features = feature_extractor(audio_samples, sampling_rate=16000, padding="do_not_pad")
    return {"spectrogram": features.input_features[0]}
def record_to_html(sample_record):
    """Render one sample as interactive HTML: an audio player, a waveform
    plot, and a spectrogram image, all linked by a shared time cursor.

    Returns a StringIO buffer containing the saved HTML document.
    """
    audio_array = np.array(sample_record["audio"]["array"])
    audio_sr = sample_record["audio"]["sampling_rate"]
    audio_duration = sample_record["length"]
    audio_spectrogram = np.array(sample_record["spectrogram"])
    # Image extent: x spans the clip duration, y spans the spectrogram range.
    bounds = (0,0, audio_duration, audio_spectrogram.max())
    # Scale float waveform (assumed in [-1, 1] — TODO confirm) to 16-bit PCM.
    waveform_int = np.int16(audio_array * 32767)
    hv_audio = pn.pane.Audio(waveform_int, sample_rate=audio_sr, name='Audio', throttle=500)
    # Hidden slider acts as the shared "current time" state between widgets.
    slider = pn.widgets.FloatSlider(end=audio_duration, visible=False, step=0.001)
    line_audio = hv.VLine(0).opts(color='black')
    line_spec = hv.VLine(0).opts(color='red')
    # Link player time <-> slider bidirectionally; slider drives both cursors.
    slider.jslink(hv_audio, value='time', bidirectional=True)
    slider.jslink(line_audio, value='glyph.location')
    slider.jslink(line_spec, value='glyph.location')
    time = np.linspace(0, audio_duration, num=len(audio_array))
    line_plot_hv = hv.Curve(
        (time, audio_array), ["Time (s)", "amplitude"]).opts(
        width=500, height=150, axiswise=True) * line_audio
    hv_spec_gram = hv.Image(
        audio_spectrogram, bounds=(bounds), kdims=["Time (s)", "Frequency (hz)"]).opts(
        width=500, height=150, labelled=[], axiswise=True, color_levels=512)* line_spec
    combined = pn.Row(hv_audio, hv_spec_gram, line_plot_hv, slider)
    audio_html = StringIO()
    combined.save(audio_html)
    # NOTE(review): buffer is returned without seek(0) — confirm the consumer
    # (wandb.Html) handles a buffer positioned at EOF.
    return audio_html
def dataset_to_records(dataset):
    """Build a DataFrame of per-sample W&B audio widgets and reference text.

    Each row holds the rendered audio+spectrogram HTML, the reference
    sentence, and the clip length.
    """
    rows = [
        {
            "audio_with_spec": wandb.Html(record_to_html(item)),
            "sentence": item["sentence"],
            "length": item["length"],
        }
        for item in dataset
    ]
    return pd.DataFrame(rows)
def decode_predictions(trainer, predictions):
    """Decode generated token ids from a PredictionOutput into plain strings."""
    token_ids = predictions.predictions
    return trainer.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
def compute_measures(predictions, labels):
    """Compute per-sample jiwer error measures.

    Returns a DataFrame with one row per (prediction, label) pair and the
    columns wer / hits / substitutions / deletions / insertions.
    """
    columns = ["wer", "hits", "substitutions", "deletions", "insertions"]
    # jiwer expects (truth, hypothesis) argument order.
    rows = [jiwer.compute_measures(reference, hypothesis)
            for hypothesis, reference in zip(predictions, labels)]
    return pd.DataFrame(rows)[columns]
class WandbProgressResultsCallback(WandbCallback):
    """WandbCallback that also logs sample predictions and model artifacts.

    On every logging step it runs prediction over a small fixed sample
    dataset and logs a W&B table of per-sample transcriptions with jiwer
    error measures; on every save it uploads the current model directory
    as a versioned W&B artifact.
    """

    def __init__(self, trainer, sample_dataset):
        super().__init__()
        self.trainer = trainer
        self.sample_dataset = sample_dataset
        # Pre-render the static table columns (audio players + references)
        # once, since they never change between logging steps.
        self.records_df = dataset_to_records(sample_dataset)

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Log a table of sample predictions alongside the regular W&B logs."""
        super().on_log(args, state, control, model, logs)
        # BUGFIX: use the trainer captured at construction time instead of
        # the module-level `trainer` global the original code relied on.
        predictions = self.trainer.predict(self.sample_dataset)
        predictions = decode_predictions(self.trainer, predictions)
        measures_df = compute_measures(predictions, self.records_df["sentence"].tolist())
        records_df = pd.concat([self.records_df, measures_df], axis=1)
        records_df["prediction"] = predictions
        records_df["step"] = state.global_step
        records_table = self._wandb.Table(dataframe=records_df)
        self._wandb.log({"sample_predictions": records_table})

    def on_save(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Save the model to a temp dir and upload it as a W&B artifact.

        Only runs on the world process-zero rank when artifact logging is
        enabled; metadata carries either numeric summary values or the
        best-metric info, mirroring upstream WandbCallback behavior.
        """
        if self._wandb is None:
            return
        if self._log_model and self._initialized and state.is_world_process_zero:
            with tempfile.TemporaryDirectory() as temp_dir:
                self.trainer.save_model(temp_dir)
                # BUGFIX: `numbers` was referenced here but never imported,
                # raising NameError on the first checkpoint save; it is now
                # imported at module level.
                metadata = (
                    {
                        k: v
                        for k, v in dict(self._wandb.summary).items()
                        if isinstance(v, numbers.Number) and not k.startswith("_")
                    }
                    if not args.load_best_model_at_end
                    else {
                        f"eval/{args.metric_for_best_model}": state.best_metric,
                        "train/total_floss": state.total_flos,
                    }
                )
                artifact = self._wandb.Artifact(
                    name=f"model-{self._wandb.run.id}",
                    type="model", metadata=metadata)
                for f in Path(temp_dir).glob("*"):
                    if f.is_file():
                        with artifact.new_file(f.name, mode="wb") as fa:
                            fa.write(f.read_bytes())
                self._wandb.run.log_artifact(artifact)
# Seq2Seq training configuration: 5k optimizer steps at an effective batch
# size of 64 (32 per device x 2 accumulation), fp16 + 8-bit Adam for memory.
training_args = Seq2SeqTrainingArguments(
    output_dir="../models/whisper-small-ta",  # change to a repo name of your choice
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    save_total_limit=4,
    warmup_steps=500,
    max_steps=5000,
    gradient_checkpointing=True,
    fp16=True,
    # fp16_full_eval=True,
    optim="adamw_bnb_8bit",  # 8-bit Adam from bitsandbytes
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=250,
    report_to="none",  # W&B logging is attached manually via callbacks below
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=True,
    hub_strategy="checkpoint",
    remove_unused_columns=False,  # needed for iterable/streaming datasets
    ignore_data_skip=True
)
PyTorch: setting up devices
# Materialize 100 test examples (with precomputed spectrograms) used both as
# the eval set and for qualitative sample logging.
samples_dataset = load_samples_dataset(dataset_dict["test"]).map(compute_spectrograms)
0%| | 0/100 [00:00<?, ?ex/s]
# Build the trainer; passing `tokenizer=processor` makes the trainer save the
# full processor alongside checkpoints.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict["train"],
    eval_dataset=samples_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
    callbacks=[ShuffleCallback()],
)
/home/ubuntu/whisper-finetuning/notebooks/../models/whisper-small-ta is already a clone of https://huggingface.co/parambharat/whisper-small-ta. Make sure you pull the latest changes with `repo.git_pull()`. max_steps is given, it will override any value given in num_train_epochs Using cuda_amp half precision backend
# The progress callback needs the constructed trainer, so it is attached
# after Seq2SeqTrainer creation rather than via the callbacks= argument.
progress_callback = WandbProgressResultsCallback(trainer, samples_dataset)
clear_output()
trainer.add_callback(progress_callback)
# model.save_pretrained(training_args.output_dir)
# processor.save_pretrained(training_args.output_dir)
trainer.train()
***** Running training ***** Num examples = 320000 Num Epochs = 9223372036854775807 Instantaneous batch size per device = 32 Total train batch size (w. parallel, distributed & accumulation) = 64 Gradient Accumulation steps = 2 Total optimization steps = 5000 Number of trainable parameters = 241734912 Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
| Step | Training Loss | Validation Loss | Wer |
|---|---|---|---|
| 500 | 0.337400 | 0.257853 | 23.380419 |
| 1000 | 0.290000 | 0.226021 | 20.993668 |
| 1500 | 0.252200 | 0.213904 | 20.068193 |
| 2000 | 0.233800 | 0.202510 | 19.678519 |
| 2500 | 0.223000 | 0.197901 | 18.314661 |
| 3000 | 0.211000 | 0.192680 | 17.827569 |
| 3500 | 0.203200 | 0.186459 | 17.389187 |
| 4000 | 0.197800 | 0.183870 | 17.535314 |
| 4500 | 0.197200 | 0.181212 | 17.096931 |
| 5000 | 0.189400 | 0.180317 | 17.145641 |
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Evaluation *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
Saving model checkpoint to ../models/whisper-small-ta/checkpoint-500
Configuration saved in ../models/whisper-small-ta/checkpoint-500/config.json
Model weights saved in ../models/whisper-small-ta/checkpoint-500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/checkpoint-500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/checkpoint-500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/checkpoint-500/added_tokens.json
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Saving model checkpoint to /tmp/tmp6eo55h_b
Configuration saved in /tmp/tmp6eo55h_b/config.json
Model weights saved in /tmp/tmp6eo55h_b/pytorch_model.bin
Feature extractor saved in /tmp/tmp6eo55h_b/preprocessor_config.json
tokenizer config file saved in /tmp/tmp6eo55h_b/tokenizer_config.json
Special tokens file saved in /tmp/tmp6eo55h_b/special_tokens_map.json
added tokens file saved in /tmp/tmp6eo55h_b/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
a6819b0..07ab463 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Evaluation *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
Saving model checkpoint to ../models/whisper-small-ta/checkpoint-1000
Configuration saved in ../models/whisper-small-ta/checkpoint-1000/config.json
Model weights saved in ../models/whisper-small-ta/checkpoint-1000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/checkpoint-1000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/checkpoint-1000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/checkpoint-1000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Saving model checkpoint to /tmp/tmpe3_fznmd
Configuration saved in /tmp/tmpe3_fznmd/config.json
Model weights saved in /tmp/tmpe3_fznmd/pytorch_model.bin
Feature extractor saved in /tmp/tmpe3_fznmd/preprocessor_config.json
tokenizer config file saved in /tmp/tmpe3_fznmd/tokenizer_config.json
Special tokens file saved in /tmp/tmpe3_fznmd/special_tokens_map.json
added tokens file saved in /tmp/tmpe3_fznmd/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
07ab463..8a53b5e main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Evaluation *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
Saving model checkpoint to ../models/whisper-small-ta/checkpoint-1500
Configuration saved in ../models/whisper-small-ta/checkpoint-1500/config.json
Model weights saved in ../models/whisper-small-ta/checkpoint-1500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/checkpoint-1500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/checkpoint-1500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/checkpoint-1500/added_tokens.json
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Saving model checkpoint to /tmp/tmpm7i5h07i
Configuration saved in /tmp/tmpm7i5h07i/config.json
Model weights saved in /tmp/tmpm7i5h07i/pytorch_model.bin
Feature extractor saved in /tmp/tmpm7i5h07i/preprocessor_config.json
tokenizer config file saved in /tmp/tmpm7i5h07i/tokenizer_config.json
Special tokens file saved in /tmp/tmpm7i5h07i/special_tokens_map.json
added tokens file saved in /tmp/tmpm7i5h07i/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
8a53b5e..dcdf420 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Evaluation *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
Saving model checkpoint to ../models/whisper-small-ta/checkpoint-2000
Configuration saved in ../models/whisper-small-ta/checkpoint-2000/config.json
Model weights saved in ../models/whisper-small-ta/checkpoint-2000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/checkpoint-2000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/checkpoint-2000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/checkpoint-2000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Saving model checkpoint to /tmp/tmptax5iloo
Configuration saved in /tmp/tmptax5iloo/config.json
Model weights saved in /tmp/tmptax5iloo/pytorch_model.bin
Feature extractor saved in /tmp/tmptax5iloo/preprocessor_config.json
tokenizer config file saved in /tmp/tmptax5iloo/tokenizer_config.json
Special tokens file saved in /tmp/tmptax5iloo/special_tokens_map.json
added tokens file saved in /tmp/tmptax5iloo/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
dcdf420..8422cd0 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Evaluation *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
Saving model checkpoint to ../models/whisper-small-ta/checkpoint-2500
Configuration saved in ../models/whisper-small-ta/checkpoint-2500/config.json
Model weights saved in ../models/whisper-small-ta/checkpoint-2500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/checkpoint-2500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/checkpoint-2500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/checkpoint-2500/added_tokens.json
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Deleting older checkpoint [../models/whisper-small-ta/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to /tmp/tmpvpcmikf5
Configuration saved in /tmp/tmpvpcmikf5/config.json
Model weights saved in /tmp/tmpvpcmikf5/pytorch_model.bin
Feature extractor saved in /tmp/tmpvpcmikf5/preprocessor_config.json
tokenizer config file saved in /tmp/tmpvpcmikf5/tokenizer_config.json
Special tokens file saved in /tmp/tmpvpcmikf5/special_tokens_map.json
added tokens file saved in /tmp/tmpvpcmikf5/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
8422cd0..92b9c07 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Evaluation *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
Saving model checkpoint to ../models/whisper-small-ta/checkpoint-3000
Configuration saved in ../models/whisper-small-ta/checkpoint-3000/config.json
Model weights saved in ../models/whisper-small-ta/checkpoint-3000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/checkpoint-3000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/checkpoint-3000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/checkpoint-3000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/checkpoint-3000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Deleting older checkpoint [../models/whisper-small-ta/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to /tmp/tmphl8t83yi
Configuration saved in /tmp/tmphl8t83yi/config.json
Model weights saved in /tmp/tmphl8t83yi/pytorch_model.bin
Feature extractor saved in /tmp/tmphl8t83yi/preprocessor_config.json
tokenizer config file saved in /tmp/tmphl8t83yi/tokenizer_config.json
Special tokens file saved in /tmp/tmphl8t83yi/special_tokens_map.json
added tokens file saved in /tmp/tmphl8t83yi/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
92b9c07..b375c14 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Evaluation *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
Saving model checkpoint to ../models/whisper-small-ta/checkpoint-3500
Configuration saved in ../models/whisper-small-ta/checkpoint-3500/config.json
Model weights saved in ../models/whisper-small-ta/checkpoint-3500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/checkpoint-3500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/checkpoint-3500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/checkpoint-3500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/checkpoint-3500/added_tokens.json
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Deleting older checkpoint [../models/whisper-small-ta/checkpoint-1500] due to args.save_total_limit
Saving model checkpoint to /tmp/tmpg3qb4don
Configuration saved in /tmp/tmpg3qb4don/config.json
Model weights saved in /tmp/tmpg3qb4don/pytorch_model.bin
Feature extractor saved in /tmp/tmpg3qb4don/preprocessor_config.json
tokenizer config file saved in /tmp/tmpg3qb4don/tokenizer_config.json
Special tokens file saved in /tmp/tmpg3qb4don/special_tokens_map.json
added tokens file saved in /tmp/tmpg3qb4don/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
b375c14..58592c9 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Evaluation *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
Saving model checkpoint to ../models/whisper-small-ta/checkpoint-4000
Configuration saved in ../models/whisper-small-ta/checkpoint-4000/config.json
Model weights saved in ../models/whisper-small-ta/checkpoint-4000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/checkpoint-4000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/checkpoint-4000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/checkpoint-4000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/checkpoint-4000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Deleting older checkpoint [../models/whisper-small-ta/checkpoint-2000] due to args.save_total_limit
Saving model checkpoint to /tmp/tmpn8kp2k4c
Configuration saved in /tmp/tmpn8kp2k4c/config.json
Model weights saved in /tmp/tmpn8kp2k4c/pytorch_model.bin
Feature extractor saved in /tmp/tmpn8kp2k4c/preprocessor_config.json
tokenizer config file saved in /tmp/tmpn8kp2k4c/tokenizer_config.json
Special tokens file saved in /tmp/tmpn8kp2k4c/special_tokens_map.json
added tokens file saved in /tmp/tmpn8kp2k4c/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
58592c9..e1b1724 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Evaluation *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
Saving model checkpoint to ../models/whisper-small-ta/checkpoint-4500
Configuration saved in ../models/whisper-small-ta/checkpoint-4500/config.json
Model weights saved in ../models/whisper-small-ta/checkpoint-4500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/checkpoint-4500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/checkpoint-4500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/checkpoint-4500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/checkpoint-4500/added_tokens.json
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Deleting older checkpoint [../models/whisper-small-ta/checkpoint-2500] due to args.save_total_limit
Saving model checkpoint to /tmp/tmpm31_ft97
Configuration saved in /tmp/tmpm31_ft97/config.json
Model weights saved in /tmp/tmpm31_ft97/pytorch_model.bin
Feature extractor saved in /tmp/tmpm31_ft97/preprocessor_config.json
tokenizer config file saved in /tmp/tmpm31_ft97/tokenizer_config.json
Special tokens file saved in /tmp/tmpm31_ft97/special_tokens_map.json
added tokens file saved in /tmp/tmpm31_ft97/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
e1b1724..5245ff0 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
***** Running Evaluation *****
Num examples = 100
Batch size = 16
***** Running Prediction *****
Num examples = 100
Batch size = 16
Saving model checkpoint to ../models/whisper-small-ta/checkpoint-5000
Configuration saved in ../models/whisper-small-ta/checkpoint-5000/config.json
Model weights saved in ../models/whisper-small-ta/checkpoint-5000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/checkpoint-5000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/checkpoint-5000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/checkpoint-5000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Deleting older checkpoint [../models/whisper-small-ta/checkpoint-3000] due to args.save_total_limit
Saving model checkpoint to /tmp/tmpf7aeicqp
Configuration saved in /tmp/tmpf7aeicqp/config.json
Model weights saved in /tmp/tmpf7aeicqp/pytorch_model.bin
Feature extractor saved in /tmp/tmpf7aeicqp/preprocessor_config.json
tokenizer config file saved in /tmp/tmpf7aeicqp/tokenizer_config.json
Special tokens file saved in /tmp/tmpf7aeicqp/special_tokens_map.json
added tokens file saved in /tmp/tmpf7aeicqp/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
5245ff0..0ea1415 main -> main
Training completed. Do not forget to share your model on huggingface.co/models =)
Loading best model from ../models/whisper-small-ta/checkpoint-4500 (score: 17.096931320019486).
***** Running Prediction *****
Num examples = 100
Batch size = 16
/home/ubuntu/whisper-finetuning/notebooks/../models/whisper-small-ta is already a clone of https://huggingface.co/parambharat/whisper-small-ta. Make sure you pull the latest changes with `repo.git_pull()`.
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend
Saving model checkpoint to /tmp/tmp0p_p2txp
Configuration saved in /tmp/tmp0p_p2txp/config.json
Model weights saved in /tmp/tmp0p_p2txp/pytorch_model.bin
Feature extractor saved in /tmp/tmp0p_p2txp/preprocessor_config.json
tokenizer config file saved in /tmp/tmp0p_p2txp/tokenizer_config.json
Special tokens file saved in /tmp/tmp0p_p2txp/special_tokens_map.json
added tokens file saved in /tmp/tmp0p_p2txp/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ta
0ea1415..825c5f6 main -> main
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ta
825c5f6..93e26e3 main -> main
TrainOutput(global_step=5000, training_loss=0.23831620330810546, metrics={'train_runtime': 86386.8863, 'train_samples_per_second': 3.704, 'train_steps_per_second': 0.058, 'total_flos': 9.23473281024e+19, 'train_loss': 0.23831620330810546, 'epoch': 1.0})
# Model-card metadata for the Hub upload: language/task tags and the
# display name shown on https://huggingface.co/parambharat/whisper-small-ta.
trainer.push_to_hub(
    language="ta",
    model_name="Whisper Small Ta - Bharat Ramanathan",  # human-readable card title
    finetuned_from="openai/whisper-small",
    tasks="automatic-speech-recognition",
    tags="whisper-event",  # community event tag so the model is discoverable
)
Saving model checkpoint to ../models/whisper-small-ta
Configuration saved in ../models/whisper-small-ta/config.json
Model weights saved in ../models/whisper-small-ta/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ta/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ta/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ta/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ta/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}, 'metrics': [{'name': 'Wer', 'type': 'wer', 'value': 17.145640526059424}]}
To https://huggingface.co/parambharat/whisper-small-ta
93e26e3..d23d815 main -> main
# Flush any pending metrics/artifacts and close the wandb run opened at notebook setup.
wandb.finish()
VBox(children=(Label(value='9542.852 MB of 9542.852 MB uploaded (315.318 MB deduped)\r'), FloatProgress(value=…
| eval/loss | █▅▄▃▃▂▂▁▁▁ |
| eval/runtime | ▂▄▂▁█▁▂▃▂▅ |
| eval/samples_per_second | ▇▅▇▇▁█▇▆▇▄ |
| eval/steps_per_second | █▆██▁██▆█▆ |
| eval/wer | █▅▄▄▂▂▁▁▁▁ |
| train/epoch | ▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇████ |
| train/global_step | ▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████ |
| train/learning_rate | ▅██▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▁▁ |
| train/loss | █▇▆▅▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁ |
| train/total_flos | ▁ |
| train/train_loss | ▁ |
| train/train_runtime | ▁ |
| train/train_samples_per_second | ▁ |
| train/train_steps_per_second | ▁ |
| eval/loss | 0.18032 |
| eval/runtime | 108.2177 |
| eval/samples_per_second | 0.924 |
| eval/steps_per_second | 0.065 |
| eval/wer | 17.14564 |
| train/epoch | 1.0 |
| train/global_step | 5000 |
| train/learning_rate | 0.0 |
| train/loss | 0.1894 |
| train/total_flos | 9.23473281024e+19 |
| train/train_loss | 0.23832 |
| train/train_runtime | 86386.8863 |
| train/train_samples_per_second | 3.704 |
| train/train_steps_per_second | 0.058 |
./wandb/run-20221212_073428-17xqqp5b/logs